library(countrycode)
library(lubridate)
library(ggplot2)
library(reshape)
library(caret)
library(doMC)
registerDoMC(4)
library(lme4)
#library(blme)

# Load the polling data used throughout the rest of the script.
pdat <- read.csv(file = "global_polling_replication_data.csv")

# Load the election list that determines the training set
# (read below through its electionid and trainingset columns).
elections.training <- read.csv(file = "elections_training_wround09082016.csv")

# Step 1: create essential features
#
# scale(x, center = FALSE) rescales a variable without centring it. scale()
# returns a one-column matrix, so every result is flattened with as.numeric():
# matrix columns inside a data frame lose their name when later combined with
# cbind() and are awkward for most downstream tooling.

# Poll weights: the weight shrinks as days.to.elec grows; the +2 keeps the
# denominator away from zero.
# Alternative terms left disabled by the original author:
#   + scale(1/((abs(pdat$pro.incumbent)+1) ), center=F) + scale(pdat$n.elecs, center=F)
# Original author's note: should do this by election
pdat$wts <- as.numeric(scale(1 / (pdat$days.to.elec + 2), center = FALSE))

# Rescaled covariates (log GDP, asp.gdp, pro.incumbent).
pdat$lgdp <- as.numeric(scale(log(pdat$GDP), center = FALSE))
pdat$asp.gdp <- as.numeric(scale(pdat$asp.gdp, center = FALSE))
pdat$pro.incumbent <- as.numeric(scale(pdat$pro.incumbent, center = FALSE))

# Log of the absolute poll margin; the tiny offset avoids log(0).
pdat$logpolls <- log(abs(pdat$pollmarg) + 1e-8)

# Round dates arrive as month/day/year text; convert them to Date.
pdat$Round.Date <- as.Date(as.character(pdat$Round.Date), format = "%m/%d/%Y")

#pdat.less <- na.omit(pdat[ pdat$electionid %in% elections.training$electionid, grepl("pollmarg|realmarg|pro.incumbent|incRun|asp.gdp|lgdp|electionid|ccode|Pollster|region", names(pdat))])
#pdat$sm.pollmarg <- NA

trainingsize <- 50
bad <- 1
# Step: creating the smoothed estimate for the training set
# Step: create loop creating smoothed estimate for each election in the testing data
#
# For every election flagged trainingset == 0, a weighted linear model of
# pollmarg is refit on the first (trainingsize + i) elections listed in
# elections.training (this includes the election currently being smoothed),
# and its fitted values are stored in pdat$sm.pollmarg.
# NOTE(review): this assumes the first `trainingsize` rows of
# elections.training are the training elections and the remaining rows are the
# test elections in the intended order -- confirm against the CSV.

pdat$sm.pollmarg <- NA

# Remove missing polls and dates. This does not depend on the loop index, so it
# is done once up front instead of on every iteration.
pdat <- pdat[!is.na(pdat$pollmarg) & !is.na(pdat$Poll.Date), ]

# Number of test elections. seq_len() yields an empty loop when this is 0,
# whereas 1:0 would iterate over c(1, 0). Note that the training elections are
# only smoothed as part of the first iteration.
n.test <- sum(elections.training$trainingset == 0)

for (i in seq_len(n.test)) {

  # designate the elections over which to train at each iteration
  train.ids <- elections.training$electionid[seq_len(trainingsize + i)]
  elecs <- pdat$electionid %in% train.ids # get T/F in or out of training set

  # Simple linear model with complete pooling.
  # na.action = na.pass keeps every row in the model frame, so predict(sm.mod)
  # below has exactly one value per row of pdat[elecs, ].
  sm.mod <- lm(
    pollmarg ~ pro.incumbent + incRun + asp.gdp + Pollster + region,
    weights = wts, na.action = na.pass, data = pdat[elecs, ]
  )

  if (i == 1) {
    # first round of the loop: predict the training set plus the first test
    # election
    pdat$sm.pollmarg[elecs] <- predict(sm.mod)
  } else {
    # later rounds: predict just the next test election, using a model that
    # encompasses all prior information
    next.elec <- pdat$electionid %in%
      elections.training$electionid[trainingsize + i]
    pdat$sm.pollmarg[next.elec] <- predict(sm.mod, newdata = pdat[next.elec, ])
    print(elections.training$electionid[trainingsize + i])
  }
  print(paste(i, "successfully completed"))
}

#Generate plot of estimated values from iterated smoothing S10
#
# Collapse pdat to one row per electionid, taking the median of every listed
# variable. aggregate()'s formula method drops rows with a missing value in any
# listed variable, so polls without a smoothed estimate do not contribute.
# cbind() turns everything into numbers: Round.Date loses its Date class, and
# region / Round are converted to integer factor codes.
# asp.gdp is passed as a named plain vector: if it is stored as an unnamed
# one-column matrix (as scale() produces), cbind() would otherwise leave the
# resulting column without its name.
sm.agg <- aggregate(
  cbind(
    sm.pollmarg, Round.Date, logpolls, pollmarg, realmarg, year,
    region = as.numeric(as.factor(region)),
    incRun, incApp, incExtend, multiparty, polity2, l1polity2, wb.inflation,
    asp.gdp = as.numeric(asp.gdp),
    Round = as.numeric(as.factor(Round))
  ) ~ electionid,
  data = pdat, FUN = median
)

# Share of elections where the smoothed margin and the real margin have the
# same sign.
# NOTE(review): a product of exactly 0 counts as a match here (>= 0).
sum(sm.agg$sm.pollmarg * sm.agg$realmarg >= 0, na.rm = TRUE) /
  sum(!is.na(sm.agg$sm.pollmarg))

# Scatter of smoothed vs. real margin with a loess trend. ggplot() +
# geom_point() replaces qplot(), which is deprecated in current ggplot2.
ggplot(sm.agg, aes(x = sm.pollmarg, y = realmarg, colour = year)) +
  geom_point() +
  geom_smooth(method = "loess") +
  ylab("Real Margin") +
  xlab("OLS Smoothed Margin (post-averaging)")


library(caret)
set.seed(13243)

# Order elections by round date so the time slices below follow that order.
sm.agg <- sm.agg[order(sm.agg$Round.Date), ]

# Rolling-origin evaluation: start from the first 50 rows, predict the next
# single row (horizon = 1), and let the training window grow
# (fixedWindow = FALSE). Held-out predictions are saved for the summaries below.
trcontrol <- trainControl(
  method = "timeslice", initialWindow = 50, horizon = 1,
  fixedWindow = FALSE, savePredictions = TRUE
) # , indexOut = indexOut

# Partial least squares with a fixed two-component fit.
trmode <- train(
  realmarg ~ sm.pollmarg + l1polity2 + wb.inflation,
  data = sm.agg, trControl = trcontrol, metric = "RMSE",
  method = "pls", tuneGrid = expand.grid(ncomp = 2)
)

# Figure S11
# Share of held-out predictions whose sign matches the observed margin.
sum((trmode$pred$pred * trmode$pred$obs) > 0) / length(trmode$pred$pred)
trmode$pred$result <- ifelse(
  (trmode$pred$pred * trmode$pred$obs) > 0, "correct", "incorrect"
)

# ggplot() + geom_point() replaces qplot(), which is deprecated in current
# ggplot2; the plot mapping is unchanged.
ggplot(trmode$pred, aes(x = pred, y = obs, colour = result)) +
  geom_point() +
  geom_smooth(method = "loess") +
  ylab("Observed Margin of Incumbent") +
  xlab("Complete Pooling Model Prediction")
